Author

Chloe Feehan

Code
library(tidyverse)
library(here)
library(knitr)
library(kableExtra)
library(gt)
library(DT)

names <- read_csv(here("Labs", "Lab 9", "StateNames_A.csv"))
Code
DT::datatable(names)

Question 1

Make a summary table of the number of babies named “Allison” for each state and the sex of the baby. Specifically, the table should have the following qualities: - each state should be its own row - and each sex should have its own column - if there were no babies born for that combination of state & sex there should be a 0 (not an NA)

Code
names |>
  rename(Sex = Gender) |>
  filter(Name == "Allison") |>
  select(State, Sex, Count) |>
  group_by(State, Sex) |>
  summarize(
    Count = sum(Count)
  ) |>
  pivot_wider(names_from = Sex,
              values_from = Count,
              values_fill = 0) |>
  knitr::kable(format = "html",
               col.names =
                 c("State", "Female", "Male"),
               caption = "Number of Babies Born with the Name Allison By Sex and State"
               ) |>
  kableExtra::kable_classic(html_font = "Optima") |>
  kable_styling(bootstrap_options = "striped", font_size = 14)
Number of Babies Born with the Name Allison By Sex and State
State Female Male
AK 232 0
AL 1535 0
AR 1198 0
AZ 1880 0
CA 12413 0
CO 1594 0
CT 1099 0
DC 321 0
DE 294 0
FL 4455 0
GA 3257 0
HI 183 0
IA 1477 0
ID 451 0
IL 5110 0
IN 3067 0
KS 1283 0
KY 1905 20
LA 1209 0
MA 2218 0
MD 2229 0
ME 340 0
MI 4014 0
MN 2374 0
MO 2882 0
MS 817 0
MT 226 0
NC 3435 0
ND 285 0
NE 807 0
NH 412 0
NJ 3052 0
NM 399 0
NV 729 0
NY 5747 0
OH 5487 0
OK 1421 0
OR 1186 0
PA 4307 0
RI 306 0
SC 1228 0
SD 376 0
TN 2488 0
TX 10192 0
UT 1125 0
VA 3220 0
VT 135 0
WA 1956 0
WI 2367 0
WV 813 0
WY 142 0

Question 2

You should have seen in the table above that “Allison” is a name given overwhelmingly to babies assigned “female” at birth. So, create a new dataset named allison_f which contains only the babies assigned Female at birth.

Code
allison_f <- names |>
  rename("Sex" = "Gender") |>
  filter(Name == "Allison") |>
  select(Year, Sex, State, Count)

Question 3

Make a visualization showing how the popularity of the name “Allison” has changed over the years. To be clear, each year should have one observation–the total number of Allisons born that year.

Code
allison_f |>
  group_by(Year) |>
  summarize(
    Count = sum(Count)
  ) |>
  ggplot(mapping = (aes(x = Year, y = Count))) +
  geom_line(color = "darkblue") +
  scale_x_continuous(breaks = c("1997":"2014"), 
                     guide = guide_axis(n.dodge = 2)) +
  scale_y_continuous(n.breaks = 8) +
  labs(x = "Year", y = "", 
       title = "Number of Babies Named Allison from 1997 to 2014") +
  theme(text = element_text(family = "Optima"))

Code
#https://ggplot2.tidyverse.org/reference/scale_continuous.html

Question 4

Fit a linear model with the year as the explanatory variable, and the number of Allisons as the response. Similar to #3, each year should have one observation–the total number of Allisons born that year.

Code
allison_f |>
  group_by(Year) |>
  summarize(
    Count = sum(Count)
  ) |>
  ggplot(mapping = aes(x = Year, y = Count)) +
  geom_point(color = "darkblue") +
  geom_smooth(method = "lm", fill = "lightblue", color = "aquamarine4") +
  scale_x_continuous(breaks = c("1997":"2014"), 
                     guide = guide_axis(n.dodge = 2)) +
  scale_y_continuous(n.breaks = 8) +
  labs(x = "Year", y = "Count", title = "Number of Babies Named Allison from 1997 to 2014") +
  theme(text = element_text(family = "Optima"))

Question 5

Code
allison_f |>
  group_by(Year) |>
  summarize(
    Count = sum(Count)
  ) |>
  lm(Count ~ Year,
     data = _)

Call:
lm(formula = Count ~ Year, data = summarize(group_by(allison_f, 
    Year), Count = sum(Count)))

Coefficients:
(Intercept)         Year  
   209815.1       -101.6  

The regression equation is:

predicted number of babies = 209815.1 - 101.6(Year)

Question 6

Code
allison_f |>
  group_by(Year) |>
  summarize(
    Count = sum(Count)
  ) |>
  lm(Count ~ Year,
     data = _) |>
  broom::augment() |>
  ggplot(mapping = aes(y = .resid, x = .fitted)) +
  geom_point(color = "darkslategray4") +
  labs(x = "Fitted", y = "Residuals", 
       title = "Residuals vs. Fitted Model on Number of Babies Named Allison from 1997 to 2014") +
  theme(text = element_text(family = "Optima"))

Question 7

What do you conclude from this model? Is my name not cool anymore?

Based on the residuals vs. fitted model, there appears to be a curved pattern. This demonstrates that there is a nonlinear relationship between the number of babies named Allison across the years. This proves that the number of babies named Allison have not been growing since the previous years.

Question 8

Narrow the A name dataset (downloaded previously) down to only male-assigned babies named “Allan”, “Alan”, or “Allen”. Make a plot comparing the popularity of these names over time.

Code
names |>
  rename("Sex" = "Gender") |>
  filter(Name %in% c("Allan", "Alan", "Allen"),
         Sex == "M") |>
  group_by(Name, Year) |>
  summarize(
    Count = sum(Count)
  ) |>
  ggplot(mapping = aes(x = Year, y = Count, color = Name)) +
    geom_line() +
    scale_y_continuous(n.breaks = 8) +
    scale_color_manual(values = c("darkslategray4", "aquamarine3", "cyan2")) +
    labs(x = "Year", y = "Count", 
         title = "Number of Babies Named Alan, Allan, or Allen from 1997 to 2014") +
  theme(text = element_text(family = "Optima"))

Question 9

In California, Allan’s spelling of his name is the least common of the three but perhaps it’s not such an unusual name for his home state of Pennsylvania. Compute the total number of babies born with each spelling of “Allan” in the year 2000, in Pennsylvania and in California. Specifically, the table should have the following qualities: - each spelling should be its own column - each state should have its own row - a 0 (not an NA) should be used to represent locations where there were no instances of these names

Code
names |>
  rename("Sex" = "Gender") |>
  filter(Year == 2000,
         State %in% c("CA", "PA"),
         Name %in% c("Allan", "Alan", "Allen"),
         Sex == "M"
         ) |>
  select(Name, State, Count) |>
  group_by(Name)|>
  pivot_wider(names_from = Name,
              values_from = Count)|>
  knitr::kable(format = "html",
               col.names =
                 c("State", "Alan", "Allen", "Allan"),
               caption = "Number of Male Babies Born in 2000 by Name and State"
               ) |>
  kableExtra::kable_classic(html_font = "Optima") |>
  kable_styling(bootstrap_options = "striped", font_size = 14)
Number of Male Babies Born in 2000 by Name and State
State Alan Allen Allan
CA 579 176 131
PA 51 56 12

Question 10

Convert your total counts to overall percents. That is, what was the percent breakdown between the three spellings in CA? What about in PA?

Code
names |>
  rename(Sex = Gender) |>
  filter(Year == 2000,
         State %in% c("CA", "PA"),
         Name %in% c("Allan", "Alan", "Allen"),
         Sex == "M"
         ) |>
  group_by(Name)|>
  mutate(
    Percentage = Count/sum(Count)
  ) |>
  select(Name, State, Percentage) |>
  pivot_wider(names_from = Name,
              values_from = Percentage)|>
  gt() |>
  tab_header(title = "Percentage of Male Babies Born in 2000 by Name and State") |>
  fmt_percent(columns = 2:4, decimals = 2) 
Percentage of Male Babies Born in 2000 by Name and State
State Alan Allen Allan
CA 91.90% 75.86% 91.61%
PA 8.10% 24.14% 8.39%